
from bert_seq2seq.tokenizer import Tokenizer, load_chinese_base_vocab


if __name__ == "__main__":

    # Demo script: load the chinese-bert vocab and poke at a test string.
    # vocab_path = "./vocab.txt"
    vocab_path = "./state_dict/bert-base-chinese-vocab.txt"

    # NOTE: "simplfied" (sic) is the actual keyword name in bert_seq2seq's
    # load_chinese_base_vocab — do not "fix" the spelling here.
    word2ix = load_chinese_base_vocab(vocab_path, simplfied=True)
    print(len(word2ix))

    s1 = "撒打算的阿斗阿斯顿阿斯顿"

    # BUGFIX: the original used s1.index("ds"), which raises ValueError
    # because "ds" does not occur in s1. str.find returns -1 instead of
    # raising, so the script runs to completion.
    print(s1.find("ds"))